Layernormgrad

计算 Layer Normalization 操作的梯度。该算子是 Layer Normalization 的反向传播部分,用于计算损失函数相对于输入 x、以及可学习参数 gamma 和 beta 的梯度。

\[ \begin{align}\begin{aligned}\text{dg}_i = \sum_{j} \text{dy}_{j,i} \cdot \frac{x_{j,i} - \mu_j}{\sqrt{\sigma_j^2 + \epsilon}}\\\text{db}_i = \sum_{j} \text{dy}_{j,i}\\\text{dx}_{j,i} = f(\text{dy}, x, \gamma, \mu, \sigma^2)\end{aligned}\end{align} \]

其中 \(i\) 为特征维度下标(共 param_num 个),\(j\) 为归一化单元下标(共 param_size 个);\(\mu_j\) 和 \(\sigma_j^2\) 分别是第 \(j\) 个单元的均值和方差,\(\epsilon\) 是一个为了防止除零而添加的极小值。dx 的计算较为复杂,它依赖于 dy、x、gamma 以及均值和方差。

输入:
  • x - 前向传播时的输入数据地址。

  • dy - 后续层反向传播回来的梯度数据地址。

  • params - 参数打包成 long long 数组,各元素按下列顺序依次存放(下标从 0 开始,即 params[0] 为 var,params[8] 为 block_size):
    • var - 前向传播时计算出的方差(variance)地址。

    • mean - 前向传播时计算出的均值(mean)地址。

    • gamma - 前向传播时使用的可学习缩放参数 \(\gamma\) 地址。

    • dg - 输出,计算出的关于 gamma 的梯度地址。

    • db - 输出,计算出的关于 beta 的梯度地址。

    • param_num - 特征维度的大小,也是 gamma 和 beta 的大小。

    • param_size - 进行独立归一化的单元数量(例如批处理大小 Batch Size)。

    • block_num - 块的数量(通常等于 param_size)。

    • block_size - 每个块的大小(通常等于 param_num)。

  • core_mask - 核掩码(仅共享存储版本需要)。

输出:
  • dx - 计算出的关于输入 x 的梯度地址。

  • dg - 计算出的关于参数 gamma 的梯度地址。

  • db - 计算出的关于参数 beta 的梯度地址。

支持平台:

FT78NE MT7004

备注:

  • FT78NE 支持 fp32

  • MT7004 支持 fp16、fp32

共享存储版本:

void hp_layer_norm_grad_s(half *x, half *dy, half *dx, long long *params, int core_mask)
void fp_layer_norm_grad_s(float *x, float *dy, float *dx, long long *params, int core_mask)

C调用示例:

 1//FT78NE示例
 2#include <stdio.h>
 3#include <layernormgrad.h> // 假设头文件名为 layernormgrad.h
 4
 5int main(int argc, char* argv[]) {
 6    float *x = (float *)0x81000000;
 7    float *dy = (float *)0x82000000;
 8    float *var = (float *)0x83000000;
 9    float *mean = (float *)0x84000000;
10    float *gamma = (float *)0x85000000;
11
12    int param_num = 8;
13    int param_size = 128;
14    int block_num = param_size;
15    int block_size = param_num;
16
17    float *dx = (float *)0x86000000;
18    float *dg = (float *)0x87000000;
19    float *db = (float *)0x88000000;
20    float *check_dx = (float *)0x89000000;
21    float *check_dg = (float *)0x8A000000;
22    float *check_db = (float *)0x8B000000;
23
24    int i = 0;
25
26    srand(seed++);
27
28    float f_min = 1.0;
29    float f_max = 2.0;
30
31    for(i = 0; i < param_num * param_size; ++i) {
32        x[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
33        dy[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
34    }
35
36    for(i = 0; i < block_num; i ++) {
37        var[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
38        mean[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
39    }
40
41    for(i = 0; i < param_num; i ++) {
42        gamma[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
43    }
44
45    long long params[12];
46    params[0] = (long long)var;
47    params[1] = (long long)mean;
48    params[2] = (long long)gamma;
49    params[3] = (long long)dg;
50    params[4] = (long long)db;
51    params[5] = (long long)param_num;
52    params[6] = (long long)param_size;
53    params[7] = (long long)block_num;
54    params[8] = (long long)block_size;
55
56    int core_mask = 0b1111;
57    fp_layer_norm_grad_s(x, dy, dx, (long long *)params, core_mask);//调用汇编
58    return 0;
59}

私有存储版本:

void hp_layer_norm_grad_p(half *x, half *dy, half *dx, long long *params)
void fp_layer_norm_grad_p(float *x, float *dy, float *dx, long long *params)

C调用示例:

 1//FT78NE示例
 2#include <stdio.h>
 3#include <layernormgrad.h> // 假设头文件名为 layernormgrad.h
 4
 5int main(int argc, char* argv[]) {
 6    float *x = (float *)0x10010000;
 7    float *dy = (float *)0x10020000;
 8    float *var = (float *)0x10030000;
 9    float *mean = (float *)0x10040000;
10    float *gamma = (float *)0x10050000;
11
12    int param_num = 8;
13    int param_size = 128;
14    int block_num = param_size;
15    int block_size = param_num;
16
17    float *dx = (float *)0x10016000;
18    float *dg = (float *)0x10026000;
19    float *db = (float *)0x10036000;
20    float *check_dx = (float *)0x10045000;
21    float *check_dg = (float *)0x10055000;
22    float *check_db = (float *)0x10060000;
23
24    int i = 0;
25
26    srand(seed++);
27
28    float f_min = 1.0;
29    float f_max = 2.0;
30
31    for(i = 0; i < param_num * param_size; ++i) {
32        x[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
33        dy[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
34    }
35
36    for(i = 0; i < block_num; i ++) {
37        var[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
38        mean[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
39    }
40
41    for(i = 0; i < param_num; i ++) {
42        gamma[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
43    }
44
45    long long params[12];
46    params[0] = (long long)var;
47    params[1] = (long long)mean;
48    params[2] = (long long)gamma;
49    params[3] = (long long)dg;
50    params[4] = (long long)db;
51    params[5] = (long long)param_num;
52    params[6] = (long long)param_size;
53    params[7] = (long long)block_num;
54    params[8] = (long long)block_size;
55
56    fp_layer_norm_grad_p(x, dy, dx, (long long *)params);//调用汇编
57    return 0;
58}